package com.hanshg.cherry.task.crawler;

import com.hanshg.cherry.model.SysCrawler;
import com.hanshg.cherry.util.MathSalary;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;
import java.util.regex.Pattern;

/**
 * WebMagic {@link us.codecraft.webmagic.processor.PageProcessor} for crawling
 * 51job: search-result pages yield job-detail links plus the next-page link,
 * while detail pages are parsed into {@code SysCrawler} records.
 *
 * @author 柠檬水
 * @version 1.0
 * @since 2020/4/23
 */
@Component
@Slf4j
public class CrawlerPageProcessor implements PageProcessor {

    /**
     * Matches runs of separator whitespace on the detail page: ASCII space
     * (U+0020), ideographic space (U+3000) and non-breaking space (U+00A0).
     * Compiled once because {@link Pattern} is immutable and thread-safe.
     * Fix: the previous class {@code [\u0020|\u3000|\u00A0]+} also matched a
     * literal '|' — '|' has no alternation meaning inside a character class.
     */
    private static final Pattern SEPARATORS = Pattern.compile("[\\u0020\\u3000\\u00A0]+");

    private final Site site = Site.me()
            .setCharset("GBK")       // encoding used by the target site
            .setTimeOut(10 * 1000)   // request timeout, ms
            .setRetrySleepTime(3000) // delay between retries, ms
            .setRetryTimes(3);       // retry count on failure

    /**
     * Process the page: on a result-list page, queue every job-detail link
     * and the next-page link; on a detail page, extract and store the data.
     *
     * @param page page fetched by the downloader
     */
    @Override
    public void process(Page page) {
        List<Selectable> jobRows = page.getHtml().css("div#resultList div.el").nodes();
        if (jobRows.isEmpty()) {
            // No result rows => this is a job-detail page.
            this.savePage(page);
        } else {
            // Result-list page: schedule every job-detail link.
            for (Selectable row : jobRows) {
                page.addTargetRequest(row.links().toString());
            }
            // Schedule the "next page" link. li.bk normally appears twice
            // (prev/next); guard the index so a layout change does not throw
            // IndexOutOfBoundsException and kill the crawl.
            List<Selectable> pager = page.getHtml().css("div.p_in li.bk").nodes();
            if (pager.size() > 1) {
                page.addTargetRequest(pager.get(1).links().toString());
            }
        }
    }

    /**
     * Extract a job posting from a detail page into a {@code SysCrawler}
     * and put it into the page's result fields for the pipeline to persist.
     *
     * @param page detail page
     */
    private void savePage(Page page) {
        Html html = page.getHtml();
        log.info("开发加载抓取到的信息");
        SysCrawler crawler = new SysCrawler();
        crawler.setCompanyName(html.css("div.cn p.cname a", "text").toString());
        List<Selectable> nodes = html.css("div.bmsg").nodes();
        if (nodes.size() > 1) {
            // Only the second div.bmsg carries the company address.
            crawler.setCompanyAddr(Jsoup.parse(nodes.get(1).toString()).text());
        }
        crawler.setCompanyInfo(Jsoup.parse(html.css("div.tmsg").toString()).text());
        crawler.setJobName(html.css("div.cn h1", "text").toString());

        crawler.setJobInfo(Jsoup.parse(html.css("div.job_msg").toString()).text());
        crawler.setUrl(page.getUrl().toString());
        Integer[] salary = MathSalary.getSalary(html.css("div.cn strong", "text").toString());
        crawler.setSalaryMin(salary[0]);
        crawler.setSalaryMax(salary[1]);

        // The msg line packs several fields separated by wide/normal spaces,
        // e.g. "city | experience | education | headcount | date". Normalize
        // the separators to commas, then split.
        String text = html.css("div.cn p.msg", "text").toString();
        String str = SEPARATORS.matcher(text).replaceAll(",").trim();
        String[] split = str.split(",");
        if (split.length == 5) {
            crawler.setJobAddr(split[0]);   // job location
            crawler.setWork(split[1]);      // years of experience
            crawler.setEducation(split[2]); // education level
            crawler.setRecruit(split[3]);   // headcount
            crawler.setTime(split[4]);      // publish date
        } else if (split.length == 4) {
            crawler.setJobAddr(split[0]);   // job location
            crawler.setEducation(split[1]); // education level
            crawler.setRecruit(split[2]);   // headcount
            crawler.setTime(split[3]);      // publish date
        } else if (split.length >= 3) {
            crawler.setJobAddr(split[0]);   // job location
            crawler.setRecruit(split[1]);   // headcount
            crawler.setTime(split[2]);      // publish date
        } else {
            // Unexpected layout: previously this path indexed split[1]/[2]
            // unconditionally and could throw ArrayIndexOutOfBoundsException.
            log.warn("Unexpected msg format on {}: '{}'", page.getUrl(), text);
            if (split.length > 0) {
                crawler.setJobAddr(split[0]);
            }
        }
        // Hand the record to the pipeline for persistence.
        page.putField("crawler", crawler);
    }

    /**
     * Get the site settings.
     *
     * @return site
     * @see Site
     */
    @Override
    public Site getSite() {
        return site;
    }
}
